#ifndef CUFFTDX_FFT_1728_FP32_INV_PTX_HPP
#define CUFFTDX_FFT_1728_FP32_INV_PTX_HPP



// Explicit specialization cufftdx_private_function<402, float, 1>.
//
// The whole body is one inline-PTX block. It is kept in a raw string literal,
// so comments cannot be placed inside it without changing the emitted PTX;
// the description below is therefore given up front.
// NOTE(review): judging by the include guard (FFT_1728_FP32_INV) this looks
// like a generated kernel piece of a 1728-point single-precision inverse FFT
// in which each thread owns 12 complex values - confirm against the generator.
//
// Parameters
//   rmem  Pointer to 12 complex<float> values (rmem[0] .. rmem[11]). Every
//         .x/.y member is read as an asm input and every one is overwritten
//         as an asm output.
//   smem  32-bit address used as the base for all st.shared / ld.shared
//         accesses in the block.
//
// Names referenced from outside this function
//   lut_sp_12_1728, lut_sp_12_144  Passed as 64-bit ("l") operands and read
//         with ld.global.v2.f32, one (f32, f32) pair per lookup.
//
// What the PTX does, in execution order (q = %tid.x / 144, t = %tid.x % 144,
// base = smem + (%tid.y + q) * 13824 bytes):
//   1. Combines the 12 input values in registers using add/sub/mul/fma with
//      the immediates 0f3F000000 (0.5) and 0f3F5DB3D7 / 0fBF5DB3D7
//      (+/-0.8660254). NOTE(review): appears to be a 12-point DFT butterfly.
//   2. Loads the pair (c, s) from lut_sp_12_1728[t] and builds its 2nd..11th
//      complex powers by repeated multiplication. Eleven of the twelve
//      results are multiplied as (re*c_k + im*s_k, im*c_k - re*s_k), i.e. by
//      the conjugate of the k-th power; one result is stored unscaled.
//   3. barrier.sync 0; stores the 12 complex results contiguously at
//      base + t*96; barrier.sync 0; reloads 12 complex values from
//      base + t*8 + j*1152, j = 0..11.
//   4. Repeats the register pass of step 1, then the power-and-multiply of
//      step 2 with the pair taken from lut_sp_12_144[t / 12].
//      barrier.sync 0; stores result k at
//      base + (t % 12)*8 + (t / 12)*1152 + k*96, k = 0..11; barrier.sync 0;
//      reloads with the same addressing as the reload in step 3.
//   5. Runs the register pass a third time and writes the results straight
//      into the output operands (rmem[0..11]).
//
// The multiply-by-954437177 / shift-by-37 and multiply-by-(-1431655765) /
// shift-by-35 sequences are unsigned divisions by 144 and by 12.
//
// The block executes barrier.sync 0 four times unconditionally.
// NOTE(review): presumably every thread of the block must therefore call this
// function from converged control flow - verify against the call site.
template<> __forceinline__ __device__ void cufftdx_private_function<402, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

// Do not edit the string below by hand: register numbers and %N operand
// indices are tied to the constraint list that follows it.
asm volatile (R"({
.reg .f32 f<819>;
.reg .b32 r<17>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %24;
mad.lo.s32 r3, r1, 13824, r2;
mov.u32 r4, %tid.x;
add.f32 f49, %37, %48;
add.f32 f50, %27, f49;
add.f32 f51, %39, %50;
add.f32 f52, %28, f51;
mul.f32 f53, f49, 0f3F000000;
sub.f32 f54, %27, f53;
sub.f32 f55, %39, %50;
mul.f32 f56, f55, 0fBF5DB3D7;
add.f32 f57, f56, f54;
sub.f32 f58, f54, f56;
mul.f32 f59, f51, 0f3F000000;
sub.f32 f60, %28, f59;
sub.f32 f61, %37, %48;
mul.f32 f62, f61, 0fBF5DB3D7;
sub.f32 f63, f60, f62;
add.f32 f64, f62, f60;
add.f32 f65, %43, %53;
add.f32 f66, %32, f65;
add.f32 f67, %44, %55;
add.f32 f68, %34, f67;
mul.f32 f69, f65, 0f3F000000;
sub.f32 f70, %32, f69;
sub.f32 f71, %44, %55;
mul.f32 f72, f71, 0fBF5DB3D7;
add.f32 f73, f72, f70;
sub.f32 f74, f70, f72;
mul.f32 f75, f67, 0f3F000000;
sub.f32 f76, %34, f75;
sub.f32 f77, %43, %53;
mul.f32 f78, f77, 0fBF5DB3D7;
sub.f32 f79, f76, f78;
add.f32 f80, f78, f76;
mul.f32 f81, f73, 0f3F000000;
mul.f32 f82, f79, 0f3F5DB3D7;
sub.f32 f83, f81, f82;
mul.f32 f84, f79, 0f3F000000;
fma.rn.f32 f85, f73, 0f3F5DB3D7, f84;
mul.f32 f86, f74, 0fBF000000;
mul.f32 f87, f80, 0f3F5DB3D7;
sub.f32 f88, f86, f87;
mul.f32 f89, f80, 0fBF000000;
fma.rn.f32 f90, f74, 0f3F5DB3D7, f89;
add.f32 f91, f50, f66;
add.f32 f92, f52, f68;
sub.f32 f93, f50, f66;
sub.f32 f94, f52, f68;
add.f32 f95, f57, f83;
add.f32 f96, f63, f85;
sub.f32 f97, f57, f83;
sub.f32 f98, f63, f85;
add.f32 f99, f58, f88;
add.f32 f100, f64, f90;
sub.f32 f101, f58, f88;
sub.f32 f102, f64, f90;
add.f32 f103, %40, %51;
add.f32 f104, %29, f103;
add.f32 f105, %42, %52;
add.f32 f106, %31, f105;
mul.f32 f107, f103, 0f3F000000;
sub.f32 f108, %29, f107;
sub.f32 f109, %42, %52;
mul.f32 f110, f109, 0fBF5DB3D7;
add.f32 f111, f110, f108;
sub.f32 f112, f108, f110;
mul.f32 f113, f105, 0f3F000000;
sub.f32 f114, %31, f113;
sub.f32 f115, %40, %51;
mul.f32 f116, f115, 0fBF5DB3D7;
sub.f32 f117, f114, f116;
add.f32 f118, f116, f114;
add.f32 f119, %45, %56;
add.f32 f120, %35, f119;
add.f32 f121, %47, %57;
add.f32 f122, %36, f121;
mul.f32 f123, f119, 0f3F000000;
sub.f32 f124, %35, f123;
sub.f32 f125, %47, %57;
mul.f32 f126, f125, 0fBF5DB3D7;
add.f32 f127, f126, f124;
sub.f32 f128, f124, f126;
mul.f32 f129, f121, 0f3F000000;
sub.f32 f130, %36, f129;
sub.f32 f131, %45, %56;
mul.f32 f132, f131, 0fBF5DB3D7;
sub.f32 f133, f130, f132;
add.f32 f134, f132, f130;
mul.f32 f135, f127, 0f3F000000;
mul.f32 f136, f133, 0f3F5DB3D7;
sub.f32 f137, f135, f136;
mul.f32 f138, f133, 0f3F000000;
fma.rn.f32 f139, f127, 0f3F5DB3D7, f138;
mul.f32 f140, f128, 0fBF000000;
mul.f32 f141, f134, 0f3F5DB3D7;
sub.f32 f142, f140, f141;
mul.f32 f143, f134, 0fBF000000;
fma.rn.f32 f144, f128, 0f3F5DB3D7, f143;
add.f32 f145, f104, f120;
add.f32 f146, f106, f122;
sub.f32 f147, f104, f120;
sub.f32 f148, f106, f122;
add.f32 f149, f111, f137;
add.f32 f150, f117, f139;
sub.f32 f151, f111, f137;
sub.f32 f152, f117, f139;
add.f32 f153, f112, f142;
add.f32 f154, f118, f144;
sub.f32 f155, f112, f142;
sub.f32 f156, f118, f144;
mul.f32 f157, f149, 0f3F5DB3D7;
mul.f32 f158, f150, 0f3F000000;
sub.f32 f159, f157, f158;
mul.f32 f160, f150, 0f3F5DB3D7;
fma.rn.f32 f161, f149, 0f3F000000, f160;
mul.f32 f162, f153, 0f3F000000;
mul.f32 f163, f154, 0f3F5DB3D7;
sub.f32 f164, f162, f163;
mul.f32 f165, f154, 0f3F000000;
fma.rn.f32 f166, f153, 0f3F5DB3D7, f165;
mul.f32 f167, f151, 0fBF000000;
mul.f32 f168, f152, 0f3F5DB3D7;
sub.f32 f169, f167, f168;
mul.f32 f170, f152, 0fBF000000;
fma.rn.f32 f171, f151, 0f3F5DB3D7, f170;
mul.f32 f172, f155, 0fBF5DB3D7;
mul.f32 f173, f156, 0f3F000000;
sub.f32 f174, f172, f173;
mul.f32 f175, f156, 0fBF5DB3D7;
fma.rn.f32 f176, f155, 0f3F000000, f175;
sub.f32 f177, f91, f145;
sub.f32 f178, f92, f146;
add.f32 f179, f95, f159;
add.f32 f180, f96, f161;
sub.f32 f181, f95, f159;
sub.f32 f182, f96, f161;
add.f32 f183, f99, f164;
add.f32 f184, f100, f166;
sub.f32 f185, f99, f164;
sub.f32 f186, f100, f166;
sub.f32 f187, f93, f148;
add.f32 f188, f94, f147;
add.f32 f189, f93, f148;
sub.f32 f190, f94, f147;
add.f32 f191, f97, f169;
add.f32 f192, f98, f171;
sub.f32 f193, f97, f169;
sub.f32 f194, f98, f171;
add.f32 f195, f101, f174;
add.f32 f196, f102, f176;
sub.f32 f197, f101, f174;
sub.f32 f198, f102, f176;
mul.wide.u32 rd2, r4, 954437177;
shr.u64 rd3, rd2, 37;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 144;
sub.s32 r7, r4, r6;
mad.lo.s32 r8, r5, 13824, r3;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %25;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f199, f200}, [rd6];
mul.f32 f203, f180, f200;
mul.f32 f204, f179, f200;
mul.f32 f205, f199, f180;
mul.f32 f206, f199, f199;
mul.f32 f207, f200, f200;
sub.f32 f208, f206, f207;
mul.f32 f209, f200, f199;
fma.rn.f32 f210, f200, f199, f209;
mul.f32 f211, f184, f210;
mul.f32 f212, f183, f210;
mul.f32 f213, f208, f184;
mul.f32 f214, f199, f208;
mul.f32 f215, f200, f210;
sub.f32 f216, f214, f215;
mul.f32 f217, f199, f210;
fma.rn.f32 f218, f200, f208, f217;
mul.f32 f219, f188, f218;
mul.f32 f220, f187, f218;
mul.f32 f221, f216, f188;
mul.f32 f222, f199, f216;
mul.f32 f223, f200, f218;
sub.f32 f224, f222, f223;
mul.f32 f225, f199, f218;
fma.rn.f32 f226, f200, f216, f225;
mul.f32 f227, f192, f226;
mul.f32 f228, f191, f226;
mul.f32 f229, f224, f192;
mul.f32 f230, f199, f224;
mul.f32 f231, f200, f226;
sub.f32 f232, f230, f231;
mul.f32 f233, f199, f226;
fma.rn.f32 f234, f200, f224, f233;
mul.f32 f235, f196, f234;
mul.f32 f236, f195, f234;
mul.f32 f237, f232, f196;
mul.f32 f238, f199, f232;
mul.f32 f239, f200, f234;
sub.f32 f240, f238, f239;
mul.f32 f241, f199, f234;
fma.rn.f32 f242, f200, f232, f241;
mul.f32 f243, f178, f242;
mul.f32 f244, f177, f242;
mul.f32 f245, f240, f178;
mul.f32 f246, f199, f240;
mul.f32 f247, f200, f242;
sub.f32 f248, f246, f247;
mul.f32 f249, f199, f242;
fma.rn.f32 f250, f200, f240, f249;
mul.f32 f251, f182, f250;
mul.f32 f252, f181, f250;
mul.f32 f253, f248, f182;
mul.f32 f254, f199, f248;
mul.f32 f255, f200, f250;
sub.f32 f256, f254, f255;
mul.f32 f257, f199, f250;
fma.rn.f32 f258, f200, f248, f257;
mul.f32 f259, f186, f258;
mul.f32 f260, f185, f258;
mul.f32 f261, f256, f186;
mul.f32 f262, f199, f256;
mul.f32 f263, f200, f258;
sub.f32 f264, f262, f263;
mul.f32 f265, f199, f258;
fma.rn.f32 f266, f200, f256, f265;
mul.f32 f267, f190, f266;
mul.f32 f268, f189, f266;
mul.f32 f269, f264, f190;
mul.f32 f270, f199, f264;
mul.f32 f271, f200, f266;
sub.f32 f272, f270, f271;
mul.f32 f273, f199, f266;
fma.rn.f32 f274, f200, f264, f273;
mul.f32 f275, f194, f274;
mul.f32 f276, f193, f274;
mul.f32 f277, f272, f194;
mul.f32 f278, f199, f272;
mul.f32 f279, f200, f274;
sub.f32 f280, f278, f279;
mul.f32 f281, f199, f274;
fma.rn.f32 f282, f200, f272, f281;
mul.f32 f283, f198, f282;
mul.f32 f284, f197, f282;
mul.f32 f285, f280, f198;
barrier.sync 0;
mad.lo.s32 r9, r7, 96, r8;
add.f32 f286, f92, f146;
add.f32 f287, f91, f145;
fma.rn.f32 f288, f199, f179, f203;
sub.f32 f289, f205, f204;
st.shared.v4.f32 [r9], {f287, f286, f288, f289};
fma.rn.f32 f290, f208, f183, f211;
sub.f32 f291, f213, f212;
sub.f32 f292, f221, f220;
fma.rn.f32 f293, f216, f187, f219;
st.shared.v4.f32 [r9+16], {f290, f291, f293, f292};
sub.f32 f294, f229, f228;
fma.rn.f32 f295, f224, f191, f227;
fma.rn.f32 f296, f232, f195, f235;
sub.f32 f297, f237, f236;
st.shared.v4.f32 [r9+32], {f295, f294, f296, f297};
fma.rn.f32 f298, f240, f177, f243;
sub.f32 f299, f245, f244;
fma.rn.f32 f300, f248, f181, f251;
sub.f32 f301, f253, f252;
st.shared.v4.f32 [r9+48], {f298, f299, f300, f301};
fma.rn.f32 f302, f256, f185, f259;
sub.f32 f303, f261, f260;
fma.rn.f32 f304, f264, f189, f267;
sub.f32 f305, f269, f268;
st.shared.v4.f32 [r9+64], {f302, f303, f304, f305};
fma.rn.f32 f306, f272, f193, f275;
sub.f32 f307, f277, f276;
fma.rn.f32 f308, f280, f197, f283;
sub.f32 f309, f285, f284;
st.shared.v4.f32 [r9+80], {f306, f307, f308, f309};
barrier.sync 0;
mad.lo.s32 r10, r7, -88, r9;
ld.shared.v2.f32 {f310, f311}, [r10];
ld.shared.v2.f32 {f314, f315}, [r10+1152];
ld.shared.v2.f32 {f318, f319}, [r10+2304];
ld.shared.v2.f32 {f322, f323}, [r10+3456];
ld.shared.v2.f32 {f326, f327}, [r10+4608];
ld.shared.v2.f32 {f330, f331}, [r10+5760];
ld.shared.v2.f32 {f334, f335}, [r10+6912];
ld.shared.v2.f32 {f338, f339}, [r10+8064];
ld.shared.v2.f32 {f342, f343}, [r10+9216];
ld.shared.v2.f32 {f346, f347}, [r10+10368];
ld.shared.v2.f32 {f350, f351}, [r10+11520];
ld.shared.v2.f32 {f354, f355}, [r10+12672];
add.f32 f358, f326, f342;
add.f32 f359, f310, f358;
add.f32 f360, f327, f343;
add.f32 f361, f311, f360;
mul.f32 f362, f358, 0f3F000000;
sub.f32 f363, f310, f362;
sub.f32 f364, f327, f343;
mul.f32 f365, f364, 0fBF5DB3D7;
add.f32 f366, f365, f363;
sub.f32 f367, f363, f365;
mul.f32 f368, f360, 0f3F000000;
sub.f32 f369, f311, f368;
sub.f32 f370, f326, f342;
mul.f32 f371, f370, 0fBF5DB3D7;
sub.f32 f372, f369, f371;
add.f32 f373, f371, f369;
add.f32 f374, f334, f350;
add.f32 f375, f318, f374;
add.f32 f376, f335, f351;
add.f32 f377, f319, f376;
mul.f32 f378, f374, 0f3F000000;
sub.f32 f379, f318, f378;
sub.f32 f380, f335, f351;
mul.f32 f381, f380, 0fBF5DB3D7;
add.f32 f382, f381, f379;
sub.f32 f383, f379, f381;
mul.f32 f384, f376, 0f3F000000;
sub.f32 f385, f319, f384;
sub.f32 f386, f334, f350;
mul.f32 f387, f386, 0fBF5DB3D7;
sub.f32 f388, f385, f387;
add.f32 f389, f387, f385;
mul.f32 f390, f382, 0f3F000000;
mul.f32 f391, f388, 0f3F5DB3D7;
sub.f32 f392, f390, f391;
mul.f32 f393, f388, 0f3F000000;
fma.rn.f32 f394, f382, 0f3F5DB3D7, f393;
mul.f32 f395, f383, 0fBF000000;
mul.f32 f396, f389, 0f3F5DB3D7;
sub.f32 f397, f395, f396;
mul.f32 f398, f389, 0fBF000000;
fma.rn.f32 f399, f383, 0f3F5DB3D7, f398;
add.f32 f400, f359, f375;
add.f32 f401, f361, f377;
sub.f32 f402, f359, f375;
sub.f32 f403, f361, f377;
add.f32 f404, f366, f392;
add.f32 f405, f372, f394;
sub.f32 f406, f366, f392;
sub.f32 f407, f372, f394;
add.f32 f408, f367, f397;
add.f32 f409, f373, f399;
sub.f32 f410, f367, f397;
sub.f32 f411, f373, f399;
add.f32 f412, f330, f346;
add.f32 f413, f314, f412;
add.f32 f414, f331, f347;
add.f32 f415, f315, f414;
mul.f32 f416, f412, 0f3F000000;
sub.f32 f417, f314, f416;
sub.f32 f418, f331, f347;
mul.f32 f419, f418, 0fBF5DB3D7;
add.f32 f420, f419, f417;
sub.f32 f421, f417, f419;
mul.f32 f422, f414, 0f3F000000;
sub.f32 f423, f315, f422;
sub.f32 f424, f330, f346;
mul.f32 f425, f424, 0fBF5DB3D7;
sub.f32 f426, f423, f425;
add.f32 f427, f425, f423;
add.f32 f428, f338, f354;
add.f32 f429, f322, f428;
add.f32 f430, f339, f355;
add.f32 f431, f323, f430;
mul.f32 f432, f428, 0f3F000000;
sub.f32 f433, f322, f432;
sub.f32 f434, f339, f355;
mul.f32 f435, f434, 0fBF5DB3D7;
add.f32 f436, f435, f433;
sub.f32 f437, f433, f435;
mul.f32 f438, f430, 0f3F000000;
sub.f32 f439, f323, f438;
sub.f32 f440, f338, f354;
mul.f32 f441, f440, 0fBF5DB3D7;
sub.f32 f442, f439, f441;
add.f32 f443, f441, f439;
mul.f32 f444, f436, 0f3F000000;
mul.f32 f445, f442, 0f3F5DB3D7;
sub.f32 f446, f444, f445;
mul.f32 f447, f442, 0f3F000000;
fma.rn.f32 f448, f436, 0f3F5DB3D7, f447;
mul.f32 f449, f437, 0fBF000000;
mul.f32 f450, f443, 0f3F5DB3D7;
sub.f32 f451, f449, f450;
mul.f32 f452, f443, 0fBF000000;
fma.rn.f32 f453, f437, 0f3F5DB3D7, f452;
add.f32 f454, f413, f429;
add.f32 f455, f415, f431;
sub.f32 f456, f413, f429;
sub.f32 f457, f415, f431;
add.f32 f458, f420, f446;
add.f32 f459, f426, f448;
sub.f32 f460, f420, f446;
sub.f32 f461, f426, f448;
add.f32 f462, f421, f451;
add.f32 f463, f427, f453;
sub.f32 f464, f421, f451;
sub.f32 f465, f427, f453;
mul.f32 f466, f458, 0f3F5DB3D7;
mul.f32 f467, f459, 0f3F000000;
sub.f32 f468, f466, f467;
mul.f32 f469, f459, 0f3F5DB3D7;
fma.rn.f32 f470, f458, 0f3F000000, f469;
mul.f32 f471, f462, 0f3F000000;
mul.f32 f472, f463, 0f3F5DB3D7;
sub.f32 f473, f471, f472;
mul.f32 f474, f463, 0f3F000000;
fma.rn.f32 f475, f462, 0f3F5DB3D7, f474;
mul.f32 f476, f460, 0fBF000000;
mul.f32 f477, f461, 0f3F5DB3D7;
sub.f32 f478, f476, f477;
mul.f32 f479, f461, 0fBF000000;
fma.rn.f32 f480, f460, 0f3F5DB3D7, f479;
mul.f32 f481, f464, 0fBF5DB3D7;
mul.f32 f482, f465, 0f3F000000;
sub.f32 f483, f481, f482;
mul.f32 f484, f465, 0fBF5DB3D7;
fma.rn.f32 f485, f464, 0f3F000000, f484;
sub.f32 f486, f400, f454;
sub.f32 f487, f401, f455;
add.f32 f488, f404, f468;
add.f32 f489, f405, f470;
sub.f32 f490, f404, f468;
sub.f32 f491, f405, f470;
add.f32 f492, f408, f473;
add.f32 f493, f409, f475;
sub.f32 f494, f408, f473;
sub.f32 f495, f409, f475;
sub.f32 f496, f402, f457;
add.f32 f497, f403, f456;
add.f32 f498, f402, f457;
sub.f32 f499, f403, f456;
add.f32 f500, f406, f478;
add.f32 f501, f407, f480;
sub.f32 f502, f406, f478;
sub.f32 f503, f407, f480;
add.f32 f504, f410, f483;
add.f32 f505, f411, f485;
sub.f32 f506, f410, f483;
sub.f32 f507, f411, f485;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 12;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %26;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f508, f509}, [rd11];
mul.f32 f512, f489, f509;
mul.f32 f513, f488, f509;
mul.f32 f514, f508, f489;
mul.f32 f515, f508, f508;
mul.f32 f516, f509, f509;
sub.f32 f517, f515, f516;
mul.f32 f518, f509, f508;
fma.rn.f32 f519, f509, f508, f518;
mul.f32 f520, f493, f519;
mul.f32 f521, f492, f519;
mul.f32 f522, f517, f493;
mul.f32 f523, f508, f517;
mul.f32 f524, f509, f519;
sub.f32 f525, f523, f524;
mul.f32 f526, f508, f519;
fma.rn.f32 f527, f509, f517, f526;
mul.f32 f528, f497, f527;
mul.f32 f529, f496, f527;
mul.f32 f530, f525, f497;
mul.f32 f531, f508, f525;
mul.f32 f532, f509, f527;
sub.f32 f533, f531, f532;
mul.f32 f534, f508, f527;
fma.rn.f32 f535, f509, f525, f534;
mul.f32 f536, f501, f535;
mul.f32 f537, f500, f535;
mul.f32 f538, f533, f501;
mul.f32 f539, f508, f533;
mul.f32 f540, f509, f535;
sub.f32 f541, f539, f540;
mul.f32 f542, f508, f535;
fma.rn.f32 f543, f509, f533, f542;
mul.f32 f544, f505, f543;
mul.f32 f545, f504, f543;
mul.f32 f546, f541, f505;
mul.f32 f547, f508, f541;
mul.f32 f548, f509, f543;
sub.f32 f549, f547, f548;
mul.f32 f550, f508, f543;
fma.rn.f32 f551, f509, f541, f550;
mul.f32 f552, f487, f551;
mul.f32 f553, f486, f551;
mul.f32 f554, f549, f487;
mul.f32 f555, f508, f549;
mul.f32 f556, f509, f551;
sub.f32 f557, f555, f556;
mul.f32 f558, f508, f551;
fma.rn.f32 f559, f509, f549, f558;
mul.f32 f560, f491, f559;
mul.f32 f561, f490, f559;
mul.f32 f562, f557, f491;
mul.f32 f563, f508, f557;
mul.f32 f564, f509, f559;
sub.f32 f565, f563, f564;
mul.f32 f566, f508, f559;
fma.rn.f32 f567, f509, f557, f566;
mul.f32 f568, f495, f567;
mul.f32 f569, f494, f567;
mul.f32 f570, f565, f495;
mul.f32 f571, f508, f565;
mul.f32 f572, f509, f567;
sub.f32 f573, f571, f572;
mul.f32 f574, f508, f567;
fma.rn.f32 f575, f509, f565, f574;
mul.f32 f576, f499, f575;
mul.f32 f577, f498, f575;
mul.f32 f578, f573, f499;
mul.f32 f579, f508, f573;
mul.f32 f580, f509, f575;
sub.f32 f581, f579, f580;
mul.f32 f582, f508, f575;
fma.rn.f32 f583, f509, f573, f582;
mul.f32 f584, f503, f583;
mul.f32 f585, f502, f583;
mul.f32 f586, f581, f503;
mul.f32 f587, f508, f581;
mul.f32 f588, f509, f583;
sub.f32 f589, f587, f588;
mul.f32 f590, f508, f583;
fma.rn.f32 f591, f509, f581, f590;
mul.f32 f592, f507, f591;
mul.f32 f593, f506, f591;
mul.f32 f594, f589, f507;
shl.b32 r14, r13, 3;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 1152, r15;
add.f32 f595, f401, f455;
add.f32 f596, f400, f454;
st.shared.v2.f32 [r16], {f596, f595};
fma.rn.f32 f597, f508, f488, f512;
sub.f32 f598, f514, f513;
st.shared.v2.f32 [r16+96], {f597, f598};
fma.rn.f32 f599, f517, f492, f520;
sub.f32 f600, f522, f521;
st.shared.v2.f32 [r16+192], {f599, f600};
fma.rn.f32 f601, f525, f496, f528;
sub.f32 f602, f530, f529;
st.shared.v2.f32 [r16+288], {f601, f602};
fma.rn.f32 f603, f533, f500, f536;
sub.f32 f604, f538, f537;
st.shared.v2.f32 [r16+384], {f603, f604};
fma.rn.f32 f605, f541, f504, f544;
sub.f32 f606, f546, f545;
st.shared.v2.f32 [r16+480], {f605, f606};
fma.rn.f32 f607, f549, f486, f552;
sub.f32 f608, f554, f553;
st.shared.v2.f32 [r16+576], {f607, f608};
fma.rn.f32 f609, f557, f490, f560;
sub.f32 f610, f562, f561;
st.shared.v2.f32 [r16+672], {f609, f610};
fma.rn.f32 f611, f565, f494, f568;
sub.f32 f612, f570, f569;
st.shared.v2.f32 [r16+768], {f611, f612};
fma.rn.f32 f613, f573, f498, f576;
sub.f32 f614, f578, f577;
st.shared.v2.f32 [r16+864], {f613, f614};
fma.rn.f32 f615, f581, f502, f584;
sub.f32 f616, f586, f585;
st.shared.v2.f32 [r16+960], {f615, f616};
fma.rn.f32 f617, f589, f506, f592;
sub.f32 f618, f594, f593;
st.shared.v2.f32 [r16+1056], {f617, f618};
barrier.sync 0;
ld.shared.v2.f32 {f619, f620}, [r10];
ld.shared.v2.f32 {f623, f624}, [r10+1152];
ld.shared.v2.f32 {f627, f628}, [r10+2304];
ld.shared.v2.f32 {f631, f632}, [r10+3456];
ld.shared.v2.f32 {f635, f636}, [r10+4608];
ld.shared.v2.f32 {f639, f640}, [r10+5760];
ld.shared.v2.f32 {f643, f644}, [r10+6912];
ld.shared.v2.f32 {f647, f648}, [r10+8064];
ld.shared.v2.f32 {f651, f652}, [r10+9216];
ld.shared.v2.f32 {f655, f656}, [r10+10368];
ld.shared.v2.f32 {f659, f660}, [r10+11520];
ld.shared.v2.f32 {f663, f664}, [r10+12672];
add.f32 f667, f635, f651;
add.f32 f668, f619, f667;
add.f32 f669, f636, f652;
add.f32 f670, f620, f669;
mul.f32 f671, f667, 0f3F000000;
sub.f32 f672, f619, f671;
sub.f32 f673, f636, f652;
mul.f32 f674, f673, 0fBF5DB3D7;
add.f32 f675, f674, f672;
sub.f32 f676, f672, f674;
mul.f32 f677, f669, 0f3F000000;
sub.f32 f678, f620, f677;
sub.f32 f679, f635, f651;
mul.f32 f680, f679, 0fBF5DB3D7;
sub.f32 f681, f678, f680;
add.f32 f682, f680, f678;
add.f32 f683, f643, f659;
add.f32 f684, f627, f683;
add.f32 f685, f644, f660;
add.f32 f686, f628, f685;
mul.f32 f687, f683, 0f3F000000;
sub.f32 f688, f627, f687;
sub.f32 f689, f644, f660;
mul.f32 f690, f689, 0fBF5DB3D7;
add.f32 f691, f690, f688;
sub.f32 f692, f688, f690;
mul.f32 f693, f685, 0f3F000000;
sub.f32 f694, f628, f693;
sub.f32 f695, f643, f659;
mul.f32 f696, f695, 0fBF5DB3D7;
sub.f32 f697, f694, f696;
add.f32 f698, f696, f694;
mul.f32 f699, f691, 0f3F000000;
mul.f32 f700, f697, 0f3F5DB3D7;
sub.f32 f701, f699, f700;
mul.f32 f702, f697, 0f3F000000;
fma.rn.f32 f703, f691, 0f3F5DB3D7, f702;
mul.f32 f704, f692, 0fBF000000;
mul.f32 f705, f698, 0f3F5DB3D7;
sub.f32 f706, f704, f705;
mul.f32 f707, f698, 0fBF000000;
fma.rn.f32 f708, f692, 0f3F5DB3D7, f707;
add.f32 f709, f668, f684;
add.f32 f710, f670, f686;
sub.f32 f711, f668, f684;
sub.f32 f712, f670, f686;
add.f32 f713, f675, f701;
add.f32 f714, f681, f703;
sub.f32 f715, f675, f701;
sub.f32 f716, f681, f703;
add.f32 f717, f676, f706;
add.f32 f718, f682, f708;
sub.f32 f719, f676, f706;
sub.f32 f720, f682, f708;
add.f32 f721, f639, f655;
add.f32 f722, f623, f721;
add.f32 f723, f640, f656;
add.f32 f724, f624, f723;
mul.f32 f725, f721, 0f3F000000;
sub.f32 f726, f623, f725;
sub.f32 f727, f640, f656;
mul.f32 f728, f727, 0fBF5DB3D7;
add.f32 f729, f728, f726;
sub.f32 f730, f726, f728;
mul.f32 f731, f723, 0f3F000000;
sub.f32 f732, f624, f731;
sub.f32 f733, f639, f655;
mul.f32 f734, f733, 0fBF5DB3D7;
sub.f32 f735, f732, f734;
add.f32 f736, f734, f732;
add.f32 f737, f647, f663;
add.f32 f738, f631, f737;
add.f32 f739, f648, f664;
add.f32 f740, f632, f739;
mul.f32 f741, f737, 0f3F000000;
sub.f32 f742, f631, f741;
sub.f32 f743, f648, f664;
mul.f32 f744, f743, 0fBF5DB3D7;
add.f32 f745, f744, f742;
sub.f32 f746, f742, f744;
mul.f32 f747, f739, 0f3F000000;
sub.f32 f748, f632, f747;
sub.f32 f749, f647, f663;
mul.f32 f750, f749, 0fBF5DB3D7;
sub.f32 f751, f748, f750;
add.f32 f752, f750, f748;
mul.f32 f753, f745, 0f3F000000;
mul.f32 f754, f751, 0f3F5DB3D7;
sub.f32 f755, f753, f754;
mul.f32 f756, f751, 0f3F000000;
fma.rn.f32 f757, f745, 0f3F5DB3D7, f756;
mul.f32 f758, f746, 0fBF000000;
mul.f32 f759, f752, 0f3F5DB3D7;
sub.f32 f760, f758, f759;
mul.f32 f761, f752, 0fBF000000;
fma.rn.f32 f762, f746, 0f3F5DB3D7, f761;
add.f32 f763, f722, f738;
add.f32 f764, f724, f740;
sub.f32 f765, f722, f738;
sub.f32 f766, f724, f740;
add.f32 f767, f729, f755;
add.f32 f768, f735, f757;
sub.f32 f769, f729, f755;
sub.f32 f770, f735, f757;
add.f32 f771, f730, f760;
add.f32 f772, f736, f762;
sub.f32 f773, f730, f760;
sub.f32 f774, f736, f762;
mul.f32 f775, f767, 0f3F5DB3D7;
mul.f32 f776, f768, 0f3F000000;
sub.f32 f777, f775, f776;
mul.f32 f778, f768, 0f3F5DB3D7;
fma.rn.f32 f779, f767, 0f3F000000, f778;
mul.f32 f780, f771, 0f3F000000;
mul.f32 f781, f772, 0f3F5DB3D7;
sub.f32 f782, f780, f781;
mul.f32 f783, f772, 0f3F000000;
fma.rn.f32 f784, f771, 0f3F5DB3D7, f783;
mul.f32 f785, f769, 0fBF000000;
mul.f32 f786, f770, 0f3F5DB3D7;
sub.f32 f787, f785, f786;
mul.f32 f788, f770, 0fBF000000;
fma.rn.f32 f789, f769, 0f3F5DB3D7, f788;
mul.f32 f790, f773, 0fBF5DB3D7;
mul.f32 f791, f774, 0f3F000000;
sub.f32 f792, f790, f791;
mul.f32 f793, f774, 0fBF5DB3D7;
fma.rn.f32 f794, f773, 0f3F000000, f793;
add.f32 %1, f710, f764;
add.f32 %0, f709, f763;
add.f32 %3, f714, f779;
add.f32 %2, f713, f777;
add.f32 %5, f718, f784;
add.f32 %4, f717, f782;
add.f32 %7, f712, f765;
sub.f32 %6, f711, f766;
add.f32 %9, f716, f789;
add.f32 %8, f715, f787;
add.f32 %11, f720, f794;
add.f32 %10, f719, f792;
sub.f32 %13, f710, f764;
sub.f32 %12, f709, f763;
sub.f32 %15, f714, f779;
sub.f32 %14, f713, f777;
sub.f32 %17, f718, f784;
sub.f32 %16, f717, f782;
sub.f32 %19, f712, f765;
add.f32 %18, f711, f766;
sub.f32 %21, f716, f789;
sub.f32 %20, f715, f787;
sub.f32 %23, f720, f794;
sub.f32 %22, f719, f792;
})"
     // Operand map: %0-%23 are the outputs rmem[0..11].{x,y}; %24 = smem;
     // %25 = lut_sp_12_1728; %26 = lut_sp_12_144; %27-%57 are the rmem inputs.
     // Seven .y inputs are listed twice, so operands %30, %33, %38, %41, %46,
     // %49 and %54 are never referenced by the PTX above. Keep the list as is:
     // removing an entry would shift every later %N index.
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y));
};




template<> __forceinline__ __device__ void cufftdx_private_function<403, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){

asm volatile (R"({
.reg .f32 f<771>;
.reg .b32 r<17>;
.reg .b64 rd<12>;
mov.u32 r1, %tid.y;
mov.u32 r2, %24;
mad.lo.s32 r3, r1, 6912, r2;
mov.u32 r4, %tid.x;
add.f32 f49, %37, %48;
add.f32 f50, %27, f49;
add.f32 f51, %39, %50;
add.f32 f52, %28, f51;
mul.f32 f53, f49, 0f3F000000;
sub.f32 f54, %27, f53;
sub.f32 f55, %39, %50;
mul.f32 f56, f55, 0fBF5DB3D7;
add.f32 f57, f56, f54;
sub.f32 f58, f54, f56;
mul.f32 f59, f51, 0f3F000000;
sub.f32 f60, %28, f59;
sub.f32 f61, %37, %48;
mul.f32 f62, f61, 0fBF5DB3D7;
sub.f32 f63, f60, f62;
add.f32 f64, f62, f60;
add.f32 f65, %43, %53;
add.f32 f66, %32, f65;
add.f32 f67, %44, %55;
add.f32 f68, %34, f67;
mul.f32 f69, f65, 0f3F000000;
sub.f32 f70, %32, f69;
sub.f32 f71, %44, %55;
mul.f32 f72, f71, 0fBF5DB3D7;
add.f32 f73, f72, f70;
sub.f32 f74, f70, f72;
mul.f32 f75, f67, 0f3F000000;
sub.f32 f76, %34, f75;
sub.f32 f77, %43, %53;
mul.f32 f78, f77, 0fBF5DB3D7;
sub.f32 f79, f76, f78;
add.f32 f80, f78, f76;
mul.f32 f81, f73, 0f3F000000;
mul.f32 f82, f79, 0f3F5DB3D7;
sub.f32 f83, f81, f82;
mul.f32 f84, f79, 0f3F000000;
fma.rn.f32 f85, f73, 0f3F5DB3D7, f84;
mul.f32 f86, f74, 0fBF000000;
mul.f32 f87, f80, 0f3F5DB3D7;
sub.f32 f88, f86, f87;
mul.f32 f89, f80, 0fBF000000;
fma.rn.f32 f90, f74, 0f3F5DB3D7, f89;
add.f32 f91, f50, f66;
add.f32 f92, f52, f68;
sub.f32 f93, f50, f66;
sub.f32 f94, f52, f68;
add.f32 f95, f57, f83;
add.f32 f96, f63, f85;
sub.f32 f97, f57, f83;
sub.f32 f98, f63, f85;
add.f32 f99, f58, f88;
add.f32 f100, f64, f90;
sub.f32 f101, f58, f88;
sub.f32 f102, f64, f90;
add.f32 f103, %40, %51;
add.f32 f104, %29, f103;
add.f32 f105, %42, %52;
add.f32 f106, %31, f105;
mul.f32 f107, f103, 0f3F000000;
sub.f32 f108, %29, f107;
sub.f32 f109, %42, %52;
mul.f32 f110, f109, 0fBF5DB3D7;
add.f32 f111, f110, f108;
sub.f32 f112, f108, f110;
mul.f32 f113, f105, 0f3F000000;
sub.f32 f114, %31, f113;
sub.f32 f115, %40, %51;
mul.f32 f116, f115, 0fBF5DB3D7;
sub.f32 f117, f114, f116;
add.f32 f118, f116, f114;
add.f32 f119, %45, %56;
add.f32 f120, %35, f119;
add.f32 f121, %47, %57;
add.f32 f122, %36, f121;
mul.f32 f123, f119, 0f3F000000;
sub.f32 f124, %35, f123;
sub.f32 f125, %47, %57;
mul.f32 f126, f125, 0fBF5DB3D7;
add.f32 f127, f126, f124;
sub.f32 f128, f124, f126;
mul.f32 f129, f121, 0f3F000000;
sub.f32 f130, %36, f129;
sub.f32 f131, %45, %56;
mul.f32 f132, f131, 0fBF5DB3D7;
sub.f32 f133, f130, f132;
add.f32 f134, f132, f130;
mul.f32 f135, f127, 0f3F000000;
mul.f32 f136, f133, 0f3F5DB3D7;
sub.f32 f137, f135, f136;
mul.f32 f138, f133, 0f3F000000;
fma.rn.f32 f139, f127, 0f3F5DB3D7, f138;
mul.f32 f140, f128, 0fBF000000;
mul.f32 f141, f134, 0f3F5DB3D7;
sub.f32 f142, f140, f141;
mul.f32 f143, f134, 0fBF000000;
fma.rn.f32 f144, f128, 0f3F5DB3D7, f143;
add.f32 f145, f104, f120;
add.f32 f146, f106, f122;
sub.f32 f147, f104, f120;
sub.f32 f148, f106, f122;
add.f32 f149, f111, f137;
add.f32 f150, f117, f139;
sub.f32 f151, f111, f137;
sub.f32 f152, f117, f139;
add.f32 f153, f112, f142;
add.f32 f154, f118, f144;
sub.f32 f155, f112, f142;
sub.f32 f156, f118, f144;
mul.f32 f157, f149, 0f3F5DB3D7;
mul.f32 f158, f150, 0f3F000000;
sub.f32 f159, f157, f158;
mul.f32 f160, f150, 0f3F5DB3D7;
fma.rn.f32 f161, f149, 0f3F000000, f160;
mul.f32 f162, f153, 0f3F000000;
mul.f32 f163, f154, 0f3F5DB3D7;
sub.f32 f164, f162, f163;
mul.f32 f165, f154, 0f3F000000;
fma.rn.f32 f166, f153, 0f3F5DB3D7, f165;
mul.f32 f167, f151, 0fBF000000;
mul.f32 f168, f152, 0f3F5DB3D7;
sub.f32 f169, f167, f168;
mul.f32 f170, f152, 0fBF000000;
fma.rn.f32 f171, f151, 0f3F5DB3D7, f170;
mul.f32 f172, f155, 0fBF5DB3D7;
mul.f32 f173, f156, 0f3F000000;
sub.f32 f174, f172, f173;
mul.f32 f175, f156, 0fBF5DB3D7;
fma.rn.f32 f176, f155, 0f3F000000, f175;
add.f32 f177, f91, f145;
add.f32 f178, f92, f146;
sub.f32 f179, f91, f145;
sub.f32 f180, f92, f146;
add.f32 f181, f95, f159;
add.f32 f182, f96, f161;
sub.f32 f183, f95, f159;
sub.f32 f184, f96, f161;
add.f32 f185, f99, f164;
add.f32 f186, f100, f166;
sub.f32 f187, f99, f164;
sub.f32 f188, f100, f166;
sub.f32 f189, f93, f148;
add.f32 f190, f94, f147;
add.f32 f191, f93, f148;
sub.f32 f192, f94, f147;
add.f32 f193, f97, f169;
add.f32 f194, f98, f171;
sub.f32 f195, f97, f169;
sub.f32 f196, f98, f171;
add.f32 f197, f101, f174;
add.f32 f198, f102, f176;
sub.f32 f199, f101, f174;
sub.f32 f200, f102, f176;
mul.wide.u32 rd2, r4, 954437177;
shr.u64 rd3, rd2, 37;
cvt.u32.u64 r5, rd3;
mul.lo.s32 r6, r5, 144;
sub.s32 r7, r4, r6;
mul.wide.u32 rd4, r7, 8;
mov.u64 rd5, %25;
add.s64 rd6, rd5, rd4;
ld.global.v2.f32 {f201, f202}, [rd6];
mul.f32 f205, f182, f202;
fma.rn.f32 f206, f201, f181, f205;
mul.f32 f207, f181, f202;
mul.f32 f208, f201, f182;
sub.f32 f209, f208, f207;
mul.f32 f210, f201, f201;
mul.f32 f211, f202, f202;
sub.f32 f212, f210, f211;
mul.f32 f213, f202, f201;
fma.rn.f32 f214, f202, f201, f213;
mul.f32 f215, f186, f214;
fma.rn.f32 f216, f212, f185, f215;
mul.f32 f217, f185, f214;
mul.f32 f218, f212, f186;
sub.f32 f219, f218, f217;
mul.f32 f220, f201, f212;
mul.f32 f221, f202, f214;
sub.f32 f222, f220, f221;
mul.f32 f223, f201, f214;
fma.rn.f32 f224, f202, f212, f223;
mul.f32 f225, f190, f224;
fma.rn.f32 f226, f222, f189, f225;
mul.f32 f227, f189, f224;
mul.f32 f228, f222, f190;
sub.f32 f229, f228, f227;
mul.f32 f230, f201, f222;
mul.f32 f231, f202, f224;
sub.f32 f232, f230, f231;
mul.f32 f233, f201, f224;
fma.rn.f32 f234, f202, f222, f233;
mul.f32 f235, f194, f234;
fma.rn.f32 f236, f232, f193, f235;
mul.f32 f237, f193, f234;
mul.f32 f238, f232, f194;
sub.f32 f239, f238, f237;
mul.f32 f240, f201, f232;
mul.f32 f241, f202, f234;
sub.f32 f242, f240, f241;
mul.f32 f243, f201, f234;
fma.rn.f32 f244, f202, f232, f243;
mul.f32 f245, f198, f244;
fma.rn.f32 f246, f242, f197, f245;
mul.f32 f247, f197, f244;
mul.f32 f248, f242, f198;
sub.f32 f249, f248, f247;
mul.f32 f250, f201, f242;
mul.f32 f251, f202, f244;
sub.f32 f252, f250, f251;
mul.f32 f253, f201, f244;
fma.rn.f32 f254, f202, f242, f253;
mul.f32 f255, f180, f254;
fma.rn.f32 f256, f252, f179, f255;
mul.f32 f257, f179, f254;
mul.f32 f258, f252, f180;
sub.f32 f259, f258, f257;
mul.f32 f260, f201, f252;
mul.f32 f261, f202, f254;
sub.f32 f262, f260, f261;
mul.f32 f263, f201, f254;
fma.rn.f32 f264, f202, f252, f263;
mul.f32 f265, f184, f264;
fma.rn.f32 f266, f262, f183, f265;
mul.f32 f267, f183, f264;
mul.f32 f268, f262, f184;
sub.f32 f269, f268, f267;
mul.f32 f270, f201, f262;
mul.f32 f271, f202, f264;
sub.f32 f272, f270, f271;
mul.f32 f273, f201, f264;
fma.rn.f32 f274, f202, f262, f273;
mul.f32 f275, f188, f274;
fma.rn.f32 f276, f272, f187, f275;
mul.f32 f277, f187, f274;
mul.f32 f278, f272, f188;
sub.f32 f279, f278, f277;
mul.f32 f280, f201, f272;
mul.f32 f281, f202, f274;
sub.f32 f282, f280, f281;
mul.f32 f283, f201, f274;
fma.rn.f32 f284, f202, f272, f283;
mul.f32 f285, f192, f284;
fma.rn.f32 f286, f282, f191, f285;
mul.f32 f287, f191, f284;
mul.f32 f288, f282, f192;
sub.f32 f289, f288, f287;
mul.f32 f290, f201, f282;
mul.f32 f291, f202, f284;
sub.f32 f292, f290, f291;
mul.f32 f293, f201, f284;
fma.rn.f32 f294, f202, f282, f293;
mul.f32 f295, f196, f294;
fma.rn.f32 f296, f292, f195, f295;
mul.f32 f297, f195, f294;
mul.f32 f298, f292, f196;
sub.f32 f299, f298, f297;
mul.f32 f300, f201, f292;
mul.f32 f301, f202, f294;
sub.f32 f302, f300, f301;
mul.f32 f303, f201, f294;
fma.rn.f32 f304, f202, f292, f303;
mul.f32 f305, f200, f304;
fma.rn.f32 f306, f302, f199, f305;
mul.f32 f307, f199, f304;
mul.f32 f308, f302, f200;
sub.f32 f309, f308, f307;
mad.lo.s32 r8, r5, 6912, r3;
barrier.sync 0;
mad.lo.s32 r9, r7, 48, r8;
st.shared.v4.f32 [r9], {f177, f206, f216, f226};
st.shared.v4.f32 [r9+16], {f236, f246, f256, f266};
st.shared.v4.f32 [r9+32], {f276, f286, f296, f306};
barrier.sync 0;
mad.lo.s32 r10, r7, -44, r9;
ld.shared.f32 f310, [r10];
ld.shared.f32 f311, [r10+576];
ld.shared.f32 f312, [r10+1152];
ld.shared.f32 f313, [r10+1728];
ld.shared.f32 f314, [r10+2304];
ld.shared.f32 f315, [r10+2880];
ld.shared.f32 f316, [r10+3456];
ld.shared.f32 f317, [r10+4032];
ld.shared.f32 f318, [r10+4608];
ld.shared.f32 f319, [r10+5184];
ld.shared.f32 f320, [r10+5760];
ld.shared.f32 f321, [r10+6336];
barrier.sync 0;
st.shared.v4.f32 [r9], {f178, f209, f219, f229};
st.shared.v4.f32 [r9+16], {f239, f249, f259, f269};
st.shared.v4.f32 [r9+32], {f279, f289, f299, f309};
barrier.sync 0;
ld.shared.f32 f322, [r10];
ld.shared.f32 f323, [r10+576];
ld.shared.f32 f324, [r10+1152];
ld.shared.f32 f325, [r10+1728];
ld.shared.f32 f326, [r10+2304];
ld.shared.f32 f327, [r10+2880];
ld.shared.f32 f328, [r10+3456];
ld.shared.f32 f329, [r10+4032];
ld.shared.f32 f330, [r10+4608];
ld.shared.f32 f331, [r10+5184];
ld.shared.f32 f332, [r10+5760];
ld.shared.f32 f333, [r10+6336];
add.f32 f334, f314, f318;
add.f32 f335, f310, f334;
add.f32 f336, f326, f330;
add.f32 f337, f322, f336;
mul.f32 f338, f334, 0f3F000000;
sub.f32 f339, f310, f338;
sub.f32 f340, f326, f330;
mul.f32 f341, f340, 0fBF5DB3D7;
add.f32 f342, f341, f339;
sub.f32 f343, f339, f341;
mul.f32 f344, f336, 0f3F000000;
sub.f32 f345, f322, f344;
sub.f32 f346, f314, f318;
mul.f32 f347, f346, 0fBF5DB3D7;
sub.f32 f348, f345, f347;
add.f32 f349, f347, f345;
add.f32 f350, f316, f320;
add.f32 f351, f312, f350;
add.f32 f352, f328, f332;
add.f32 f353, f324, f352;
mul.f32 f354, f350, 0f3F000000;
sub.f32 f355, f312, f354;
sub.f32 f356, f328, f332;
mul.f32 f357, f356, 0fBF5DB3D7;
add.f32 f358, f357, f355;
sub.f32 f359, f355, f357;
mul.f32 f360, f352, 0f3F000000;
sub.f32 f361, f324, f360;
sub.f32 f362, f316, f320;
mul.f32 f363, f362, 0fBF5DB3D7;
sub.f32 f364, f361, f363;
add.f32 f365, f363, f361;
mul.f32 f366, f358, 0f3F000000;
mul.f32 f367, f364, 0f3F5DB3D7;
sub.f32 f368, f366, f367;
mul.f32 f369, f364, 0f3F000000;
fma.rn.f32 f370, f358, 0f3F5DB3D7, f369;
mul.f32 f371, f359, 0fBF000000;
mul.f32 f372, f365, 0f3F5DB3D7;
sub.f32 f373, f371, f372;
mul.f32 f374, f365, 0fBF000000;
fma.rn.f32 f375, f359, 0f3F5DB3D7, f374;
add.f32 f376, f335, f351;
add.f32 f377, f337, f353;
sub.f32 f378, f335, f351;
sub.f32 f379, f337, f353;
add.f32 f380, f342, f368;
add.f32 f381, f348, f370;
sub.f32 f382, f342, f368;
sub.f32 f383, f348, f370;
add.f32 f384, f343, f373;
add.f32 f385, f349, f375;
sub.f32 f386, f343, f373;
sub.f32 f387, f349, f375;
add.f32 f388, f315, f319;
add.f32 f389, f311, f388;
add.f32 f390, f327, f331;
add.f32 f391, f323, f390;
mul.f32 f392, f388, 0f3F000000;
sub.f32 f393, f311, f392;
sub.f32 f394, f327, f331;
mul.f32 f395, f394, 0fBF5DB3D7;
add.f32 f396, f395, f393;
sub.f32 f397, f393, f395;
mul.f32 f398, f390, 0f3F000000;
sub.f32 f399, f323, f398;
sub.f32 f400, f315, f319;
mul.f32 f401, f400, 0fBF5DB3D7;
sub.f32 f402, f399, f401;
add.f32 f403, f401, f399;
add.f32 f404, f317, f321;
add.f32 f405, f313, f404;
add.f32 f406, f329, f333;
add.f32 f407, f325, f406;
mul.f32 f408, f404, 0f3F000000;
sub.f32 f409, f313, f408;
sub.f32 f410, f329, f333;
mul.f32 f411, f410, 0fBF5DB3D7;
add.f32 f412, f411, f409;
sub.f32 f413, f409, f411;
mul.f32 f414, f406, 0f3F000000;
sub.f32 f415, f325, f414;
sub.f32 f416, f317, f321;
mul.f32 f417, f416, 0fBF5DB3D7;
sub.f32 f418, f415, f417;
add.f32 f419, f417, f415;
mul.f32 f420, f412, 0f3F000000;
mul.f32 f421, f418, 0f3F5DB3D7;
sub.f32 f422, f420, f421;
mul.f32 f423, f418, 0f3F000000;
fma.rn.f32 f424, f412, 0f3F5DB3D7, f423;
mul.f32 f425, f413, 0fBF000000;
mul.f32 f426, f419, 0f3F5DB3D7;
sub.f32 f427, f425, f426;
mul.f32 f428, f419, 0fBF000000;
fma.rn.f32 f429, f413, 0f3F5DB3D7, f428;
add.f32 f430, f389, f405;
add.f32 f431, f391, f407;
sub.f32 f432, f389, f405;
sub.f32 f433, f391, f407;
add.f32 f434, f396, f422;
add.f32 f435, f402, f424;
sub.f32 f436, f396, f422;
sub.f32 f437, f402, f424;
add.f32 f438, f397, f427;
add.f32 f439, f403, f429;
sub.f32 f440, f397, f427;
sub.f32 f441, f403, f429;
mul.f32 f442, f434, 0f3F5DB3D7;
mul.f32 f443, f435, 0f3F000000;
sub.f32 f444, f442, f443;
mul.f32 f445, f435, 0f3F5DB3D7;
fma.rn.f32 f446, f434, 0f3F000000, f445;
mul.f32 f447, f438, 0f3F000000;
mul.f32 f448, f439, 0f3F5DB3D7;
sub.f32 f449, f447, f448;
mul.f32 f450, f439, 0f3F000000;
fma.rn.f32 f451, f438, 0f3F5DB3D7, f450;
mul.f32 f452, f436, 0fBF000000;
mul.f32 f453, f437, 0f3F5DB3D7;
sub.f32 f454, f452, f453;
mul.f32 f455, f437, 0fBF000000;
fma.rn.f32 f456, f436, 0f3F5DB3D7, f455;
mul.f32 f457, f440, 0fBF5DB3D7;
mul.f32 f458, f441, 0f3F000000;
sub.f32 f459, f457, f458;
mul.f32 f460, f441, 0fBF5DB3D7;
fma.rn.f32 f461, f440, 0f3F000000, f460;
add.f32 f462, f376, f430;
add.f32 f463, f377, f431;
sub.f32 f464, f376, f430;
sub.f32 f465, f377, f431;
add.f32 f466, f380, f444;
add.f32 f467, f381, f446;
sub.f32 f468, f380, f444;
sub.f32 f469, f381, f446;
add.f32 f470, f384, f449;
add.f32 f471, f385, f451;
sub.f32 f472, f384, f449;
sub.f32 f473, f385, f451;
sub.f32 f474, f378, f433;
add.f32 f475, f379, f432;
add.f32 f476, f378, f433;
sub.f32 f477, f379, f432;
add.f32 f478, f382, f454;
add.f32 f479, f383, f456;
sub.f32 f480, f382, f454;
sub.f32 f481, f383, f456;
add.f32 f482, f386, f459;
add.f32 f483, f387, f461;
sub.f32 f484, f386, f459;
sub.f32 f485, f387, f461;
mul.wide.u32 rd7, r7, -1431655765;
shr.u64 rd8, rd7, 35;
cvt.u32.u64 r11, rd8;
mul.lo.s32 r12, r11, 12;
sub.s32 r13, r7, r12;
mul.wide.u32 rd9, r11, 8;
mov.u64 rd10, %26;
add.s64 rd11, rd10, rd9;
ld.global.v2.f32 {f486, f487}, [rd11];
mul.f32 f490, f467, f487;
fma.rn.f32 f491, f486, f466, f490;
mul.f32 f492, f466, f487;
mul.f32 f493, f486, f467;
sub.f32 f494, f493, f492;
mul.f32 f495, f486, f486;
mul.f32 f496, f487, f487;
sub.f32 f497, f495, f496;
mul.f32 f498, f487, f486;
fma.rn.f32 f499, f487, f486, f498;
mul.f32 f500, f471, f499;
fma.rn.f32 f501, f497, f470, f500;
mul.f32 f502, f470, f499;
mul.f32 f503, f497, f471;
sub.f32 f504, f503, f502;
mul.f32 f505, f486, f497;
mul.f32 f506, f487, f499;
sub.f32 f507, f505, f506;
mul.f32 f508, f486, f499;
fma.rn.f32 f509, f487, f497, f508;
mul.f32 f510, f475, f509;
fma.rn.f32 f511, f507, f474, f510;
mul.f32 f512, f474, f509;
mul.f32 f513, f507, f475;
sub.f32 f514, f513, f512;
mul.f32 f515, f486, f507;
mul.f32 f516, f487, f509;
sub.f32 f517, f515, f516;
mul.f32 f518, f486, f509;
fma.rn.f32 f519, f487, f507, f518;
mul.f32 f520, f479, f519;
fma.rn.f32 f521, f517, f478, f520;
mul.f32 f522, f478, f519;
mul.f32 f523, f517, f479;
sub.f32 f524, f523, f522;
mul.f32 f525, f486, f517;
mul.f32 f526, f487, f519;
sub.f32 f527, f525, f526;
mul.f32 f528, f486, f519;
fma.rn.f32 f529, f487, f517, f528;
mul.f32 f530, f483, f529;
fma.rn.f32 f531, f527, f482, f530;
mul.f32 f532, f482, f529;
mul.f32 f533, f527, f483;
sub.f32 f534, f533, f532;
mul.f32 f535, f486, f527;
mul.f32 f536, f487, f529;
sub.f32 f537, f535, f536;
mul.f32 f538, f486, f529;
fma.rn.f32 f539, f487, f527, f538;
mul.f32 f540, f465, f539;
fma.rn.f32 f541, f537, f464, f540;
mul.f32 f542, f464, f539;
mul.f32 f543, f537, f465;
sub.f32 f544, f543, f542;
mul.f32 f545, f486, f537;
mul.f32 f546, f487, f539;
sub.f32 f547, f545, f546;
mul.f32 f548, f486, f539;
fma.rn.f32 f549, f487, f537, f548;
mul.f32 f550, f469, f549;
fma.rn.f32 f551, f547, f468, f550;
mul.f32 f552, f468, f549;
mul.f32 f553, f547, f469;
sub.f32 f554, f553, f552;
mul.f32 f555, f486, f547;
mul.f32 f556, f487, f549;
sub.f32 f557, f555, f556;
mul.f32 f558, f486, f549;
fma.rn.f32 f559, f487, f547, f558;
mul.f32 f560, f473, f559;
fma.rn.f32 f561, f557, f472, f560;
mul.f32 f562, f472, f559;
mul.f32 f563, f557, f473;
sub.f32 f564, f563, f562;
mul.f32 f565, f486, f557;
mul.f32 f566, f487, f559;
sub.f32 f567, f565, f566;
mul.f32 f568, f486, f559;
fma.rn.f32 f569, f487, f557, f568;
mul.f32 f570, f477, f569;
fma.rn.f32 f571, f567, f476, f570;
mul.f32 f572, f476, f569;
mul.f32 f573, f567, f477;
sub.f32 f574, f573, f572;
mul.f32 f575, f486, f567;
mul.f32 f576, f487, f569;
sub.f32 f577, f575, f576;
mul.f32 f578, f486, f569;
fma.rn.f32 f579, f487, f567, f578;
mul.f32 f580, f481, f579;
fma.rn.f32 f581, f577, f480, f580;
mul.f32 f582, f480, f579;
mul.f32 f583, f577, f481;
sub.f32 f584, f583, f582;
mul.f32 f585, f486, f577;
mul.f32 f586, f487, f579;
sub.f32 f587, f585, f586;
mul.f32 f588, f486, f579;
fma.rn.f32 f589, f487, f577, f588;
mul.f32 f590, f485, f589;
fma.rn.f32 f591, f587, f484, f590;
mul.f32 f592, f484, f589;
mul.f32 f593, f587, f485;
sub.f32 f594, f593, f592;
shl.b32 r14, r13, 2;
add.s32 r15, r8, r14;
barrier.sync 0;
mad.lo.s32 r16, r11, 576, r15;
st.shared.f32 [r16], f462;
st.shared.f32 [r16+48], f491;
st.shared.f32 [r16+96], f501;
st.shared.f32 [r16+144], f511;
st.shared.f32 [r16+192], f521;
st.shared.f32 [r16+240], f531;
st.shared.f32 [r16+288], f541;
st.shared.f32 [r16+336], f551;
st.shared.f32 [r16+384], f561;
st.shared.f32 [r16+432], f571;
st.shared.f32 [r16+480], f581;
st.shared.f32 [r16+528], f591;
barrier.sync 0;
ld.shared.f32 f595, [r10];
ld.shared.f32 f596, [r10+576];
ld.shared.f32 f597, [r10+1152];
ld.shared.f32 f598, [r10+1728];
ld.shared.f32 f599, [r10+2304];
ld.shared.f32 f600, [r10+2880];
ld.shared.f32 f601, [r10+3456];
ld.shared.f32 f602, [r10+4032];
ld.shared.f32 f603, [r10+4608];
ld.shared.f32 f604, [r10+5184];
ld.shared.f32 f605, [r10+5760];
ld.shared.f32 f606, [r10+6336];
barrier.sync 0;
st.shared.f32 [r16], f463;
st.shared.f32 [r16+48], f494;
st.shared.f32 [r16+96], f504;
st.shared.f32 [r16+144], f514;
st.shared.f32 [r16+192], f524;
st.shared.f32 [r16+240], f534;
st.shared.f32 [r16+288], f544;
st.shared.f32 [r16+336], f554;
st.shared.f32 [r16+384], f564;
st.shared.f32 [r16+432], f574;
st.shared.f32 [r16+480], f584;
st.shared.f32 [r16+528], f594;
barrier.sync 0;
ld.shared.f32 f607, [r10];
ld.shared.f32 f608, [r10+576];
ld.shared.f32 f609, [r10+1152];
ld.shared.f32 f610, [r10+1728];
ld.shared.f32 f611, [r10+2304];
ld.shared.f32 f612, [r10+2880];
ld.shared.f32 f613, [r10+3456];
ld.shared.f32 f614, [r10+4032];
ld.shared.f32 f615, [r10+4608];
ld.shared.f32 f616, [r10+5184];
ld.shared.f32 f617, [r10+5760];
ld.shared.f32 f618, [r10+6336];
add.f32 f619, f599, f603;
add.f32 f620, f595, f619;
add.f32 f621, f611, f615;
add.f32 f622, f607, f621;
mul.f32 f623, f619, 0f3F000000;
sub.f32 f624, f595, f623;
sub.f32 f625, f611, f615;
mul.f32 f626, f625, 0fBF5DB3D7;
add.f32 f627, f626, f624;
sub.f32 f628, f624, f626;
mul.f32 f629, f621, 0f3F000000;
sub.f32 f630, f607, f629;
sub.f32 f631, f599, f603;
mul.f32 f632, f631, 0fBF5DB3D7;
sub.f32 f633, f630, f632;
add.f32 f634, f632, f630;
add.f32 f635, f601, f605;
add.f32 f636, f597, f635;
add.f32 f637, f613, f617;
add.f32 f638, f609, f637;
mul.f32 f639, f635, 0f3F000000;
sub.f32 f640, f597, f639;
sub.f32 f641, f613, f617;
mul.f32 f642, f641, 0fBF5DB3D7;
add.f32 f643, f642, f640;
sub.f32 f644, f640, f642;
mul.f32 f645, f637, 0f3F000000;
sub.f32 f646, f609, f645;
sub.f32 f647, f601, f605;
mul.f32 f648, f647, 0fBF5DB3D7;
sub.f32 f649, f646, f648;
add.f32 f650, f648, f646;
mul.f32 f651, f643, 0f3F000000;
mul.f32 f652, f649, 0f3F5DB3D7;
sub.f32 f653, f651, f652;
mul.f32 f654, f649, 0f3F000000;
fma.rn.f32 f655, f643, 0f3F5DB3D7, f654;
mul.f32 f656, f644, 0fBF000000;
mul.f32 f657, f650, 0f3F5DB3D7;
sub.f32 f658, f656, f657;
mul.f32 f659, f650, 0fBF000000;
fma.rn.f32 f660, f644, 0f3F5DB3D7, f659;
add.f32 f661, f620, f636;
add.f32 f662, f622, f638;
sub.f32 f663, f620, f636;
sub.f32 f664, f622, f638;
add.f32 f665, f627, f653;
add.f32 f666, f633, f655;
sub.f32 f667, f627, f653;
sub.f32 f668, f633, f655;
add.f32 f669, f628, f658;
add.f32 f670, f634, f660;
sub.f32 f671, f628, f658;
sub.f32 f672, f634, f660;
add.f32 f673, f600, f604;
add.f32 f674, f596, f673;
add.f32 f675, f612, f616;
add.f32 f676, f608, f675;
mul.f32 f677, f673, 0f3F000000;
sub.f32 f678, f596, f677;
sub.f32 f679, f612, f616;
mul.f32 f680, f679, 0fBF5DB3D7;
add.f32 f681, f680, f678;
sub.f32 f682, f678, f680;
mul.f32 f683, f675, 0f3F000000;
sub.f32 f684, f608, f683;
sub.f32 f685, f600, f604;
mul.f32 f686, f685, 0fBF5DB3D7;
sub.f32 f687, f684, f686;
add.f32 f688, f686, f684;
add.f32 f689, f602, f606;
add.f32 f690, f598, f689;
add.f32 f691, f614, f618;
add.f32 f692, f610, f691;
mul.f32 f693, f689, 0f3F000000;
sub.f32 f694, f598, f693;
sub.f32 f695, f614, f618;
mul.f32 f696, f695, 0fBF5DB3D7;
add.f32 f697, f696, f694;
sub.f32 f698, f694, f696;
mul.f32 f699, f691, 0f3F000000;
sub.f32 f700, f610, f699;
sub.f32 f701, f602, f606;
mul.f32 f702, f701, 0fBF5DB3D7;
sub.f32 f703, f700, f702;
add.f32 f704, f702, f700;
mul.f32 f705, f697, 0f3F000000;
mul.f32 f706, f703, 0f3F5DB3D7;
sub.f32 f707, f705, f706;
mul.f32 f708, f703, 0f3F000000;
fma.rn.f32 f709, f697, 0f3F5DB3D7, f708;
mul.f32 f710, f698, 0fBF000000;
mul.f32 f711, f704, 0f3F5DB3D7;
sub.f32 f712, f710, f711;
mul.f32 f713, f704, 0fBF000000;
fma.rn.f32 f714, f698, 0f3F5DB3D7, f713;
add.f32 f715, f674, f690;
add.f32 f716, f676, f692;
sub.f32 f717, f674, f690;
sub.f32 f718, f676, f692;
add.f32 f719, f681, f707;
add.f32 f720, f687, f709;
sub.f32 f721, f681, f707;
sub.f32 f722, f687, f709;
add.f32 f723, f682, f712;
add.f32 f724, f688, f714;
sub.f32 f725, f682, f712;
sub.f32 f726, f688, f714;
mul.f32 f727, f719, 0f3F5DB3D7;
mul.f32 f728, f720, 0f3F000000;
sub.f32 f729, f727, f728;
mul.f32 f730, f720, 0f3F5DB3D7;
fma.rn.f32 f731, f719, 0f3F000000, f730;
mul.f32 f732, f723, 0f3F000000;
mul.f32 f733, f724, 0f3F5DB3D7;
sub.f32 f734, f732, f733;
mul.f32 f735, f724, 0f3F000000;
fma.rn.f32 f736, f723, 0f3F5DB3D7, f735;
mul.f32 f737, f721, 0fBF000000;
mul.f32 f738, f722, 0f3F5DB3D7;
sub.f32 f739, f737, f738;
mul.f32 f740, f722, 0fBF000000;
fma.rn.f32 f741, f721, 0f3F5DB3D7, f740;
mul.f32 f742, f725, 0fBF5DB3D7;
mul.f32 f743, f726, 0f3F000000;
sub.f32 f744, f742, f743;
mul.f32 f745, f726, 0fBF5DB3D7;
fma.rn.f32 f746, f725, 0f3F000000, f745;
add.f32 %0, f661, f715;
add.f32 %1, f662, f716;
add.f32 %3, f666, f731;
add.f32 %2, f665, f729;
add.f32 %5, f670, f736;
add.f32 %4, f669, f734;
add.f32 %7, f664, f717;
sub.f32 %6, f663, f718;
add.f32 %9, f668, f741;
add.f32 %8, f667, f739;
add.f32 %11, f672, f746;
add.f32 %10, f671, f744;
sub.f32 %12, f661, f715;
sub.f32 %13, f662, f716;
sub.f32 %15, f666, f731;
sub.f32 %14, f665, f729;
sub.f32 %17, f670, f736;
sub.f32 %16, f669, f734;
sub.f32 %19, f664, f717;
add.f32 %18, f663, f718;
sub.f32 %21, f668, f741;
sub.f32 %20, f667, f739;
sub.f32 %23, f672, f746;
sub.f32 %22, f671, f744;
})"
     : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y));
};


#endif
